# -*- coding: utf-8 -*-
'''
Created on Aug 20, 2012

@author: LONG HOANG GIANG
'''
import sys, os
sys.path.append(os.path.expanduser('/home5/vietcntt/longhoanggiang/python'))
sys.path.append(os.path.join(os.path.dirname(__file__), '../'))
from CrawlerLib import commonlib
import datetime
import mechanize
from lxml import etree
import re

# Shared browser used by every fetch in this module.
br = mechanize.Browser()
# Cookie jar attached to the browser below.
cookie = mechanize.LWPCookieJar()
br.set_cookiejar(cookie)
# Turn on the browser's referer handling.
br.set_handle_referer(True)

def getHtml(url):
    """Open ``url`` with the module-level browser ``br`` and return what the response's ``read()`` yields."""
    return br.open(url).read()

def buildTree(url):
    """Fetch ``url`` via getHtml() and hand the result to ``commonlib.build_tree_from_html``.

    Returns whatever that helper returns for the downloaded markup.
    """
    return commonlib.build_tree_from_html(getHtml(url))

def validateFirstUrl(url):
    """Return ``url`` in the paginated ``.../pageN.html`` form.

    A URL that already ends in ``/page<digits>.html`` is returned untouched.
    Otherwise every ``/<name>.html`` segment (lowercase letters and digits
    only) is rewritten to ``/<name>/page1.html``; a URL without such a
    segment comes back unchanged.
    """
    alreadyPaged = re.search(r"/page\d+\.html$", url) is not None
    if alreadyPaged:
        return url
    return re.sub(r"/([a-z0-9]+)\.html", "/\\1/page1.html", url)

def process(url, username, folder=None):
    """Collect the long posts of one user from a forum thread into a single HTML file.

    The thread is walked page by page: each page is fetched with
    buildTree(), the post bodies selected by the XPath below are appended to
    one HTML document, and the pager link is followed until no further page
    is found.  The document is then written to ``<folder>content.html``.

    url -- thread URL; it is normalised with validateFirstUrl() first.
    username -- text that the author link of a post must contain.  It is
        inserted into the XPath expression verbatim, so it must not contain
        a single quote.
    folder -- output path prefix; the file name is appended directly, so it
        should end with a path separator.  Defaults to the module-level
        ``FOLDER`` (which is only defined when the file runs as a script).
    """
    if folder is None:
        folder = FOLDER
    maxPages = 1000  # safety cap so a broken pager cannot loop forever
    pagesDone = 0
    url = validateFirstUrl(url)
    # Fragments are gathered in a list and joined once at the end instead of
    # growing one string with += on every post.
    parts = ['<html><head><meta charset="utf-8" /></head><body>']
    while pagesDone <= maxPages:
        print('> process page {0}'.format(commonlib.extractText(r"page(\d+)\.html", url, 1, '1')))
        tree = buildTree(url)
        if tree is None:
            break
        nodes = tree.xpath("//div[contains(@id, 'edit')]//a[@class='bigusernames' and contains(., '{0}')]/ancestor::*[name()='tr'][2]/following-sibling::*[1]//div[contains(@id, 'post_message')]".format(username))
        print('\t{0} posts'.format(len(nodes)))
        for item in nodes:
            # Skip posts whose stringified content is shorter than 700.
            if len(commonlib.stringify(item)) < 700:
                continue
            parts.append(etree.tounicode(item).encode('utf-8'))
            parts.append('<br /><br /><br />')
        nextPage = tree.xpath("//table/tr/td[@class='paging']/span/strong/../../following-sibling::*[1]/a")
        if not nextPage:
            # No pager link after the current page marker: last page reached.
            break
        # NOTE(review): stringify() receives the whole result list here, not a
        # single element -- confirm the helper accepts a list.
        pageNum = commonlib.stringify(nextPage)
        nextUrl = re.sub(r"/page(\d+)\.html", "/page{0}.html".format(pageNum), url)
        if nextUrl == url:
            # The pager did not advance; fetching the same page again would
            # only duplicate its posts until the page cap is hit.
            break
        url = nextUrl
        pagesDone += 1
    parts.append('</body></html>')
    commonlib.file_put_contents('{0}content.html'.format(folder), ''.join(parts))
    

if __name__ == '__main__':

    # Output path prefix; process() reads this module-level name when
    # building the path of content.html.
    FOLDER = '/longhoanggiang/zingforum/'
    if not os.path.isdir(FOLDER):
        os.makedirs(FOLDER, 0o777)
    ######################################################
    # Thread to crawl and the author whose posts are kept.
    url = 'http://forum.zing.vn/dien-dan-gioi-tre/fiction-13-lo-lem-va-xa-hoi-den/t1028734/page1.html'
    username = 'sato_1999'
    ######################################################
    process(url, username)

    print('> Finished at {0}'.format(datetime.datetime.now()))
    # os._exit() terminates without flushing stdio buffers, so flush first to
    # keep the message above; exit with status 0 because the run succeeded.
    sys.stdout.flush()
    os._exit(0)